"""
抓取prl导读中的文章和editor suggestions
"""

# Networking uses only the standard library (http.client).
import time
from random import random
from http.client import HTTPSConnection
from prlparser import PRLParser
from absparser import ABSParser
from aitrans import Qwenturbo
from tyqwkey_dev import TYQW_APIKEY
from candidates_dev import CANDIDATES


# Volume and issue number of the PRL issue to fetch.
ISSUE   = 132
NUMBER  = 13

# APS journal host and request paths.
PRLURL  = "journals.aps.org"
PRLAPI  = "/prl/issues/{:d}/{:d}".format(ISSUE, NUMBER)
# Abstract path template; filled with an article identifier taken from
# the issue page (presumably a DOI-like id — confirm against PRLParser).
ABSAPI  = "/prl/abstract/{:s}"

print(PRLAPI)


def headers():
    """Return HTTP request headers that mimic a desktop browser.

    Sending a browser-like User-agent makes the journal site less
    likely to reject the request as a bot.
    """
    return {
        'Accept': 'text/html',
        'User-agent': (
            'Mozilla/5.0 (Windows NT 10.0; WOW 64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/55.0.2883.87 Safari/537.36 QIHU 360SE'
        ),
    }


def get_page_txt(conn, apipath):
    """Fetch *apipath* over *conn* and return the body decoded as UTF-8.

    Follows at most one redirect (301/302/307/308) if the server issues
    one — the original code only handled 301, but the site may answer
    with other redirect codes as well.

    Args:
        conn: an open http.client.HTTPSConnection to the journal host.
        apipath: request path, e.g. "/prl/issues/132/13".

    Returns:
        The response body as a str.

    Raises:
        ConnectionError: if the final response status is not 200.
    """
    conn.request("GET", apipath, headers=headers())
    resp = conn.getresponse()
    if resp.status in (301, 302, 307, 308):
        location = resp.headers["Location"]
        # The previous response body must be fully read before the
        # connection can be reused for the follow-up request.
        resp.read()
        print(location)
        conn.request("GET", location, headers=headers())
        resp = conn.getresponse()
    if resp.status != 200:
        # Include the status and path so failures are diagnosable.
        raise ConnectionError(
            "HTTP {:d} fetching {:s}".format(resp.status, apipath))
    return resp.read().decode("utf-8")


def assign_articles(artlist, candidates=None):
    """Distribute articles evenly among candidates.

    Articles are dealt round-robin while at least one full round
    remains; the final partial round is handed out to candidates in a
    random order so nobody is systematically favored.

    Args:
        artlist: sequence of article identifiers to distribute.
        candidates: iterable of assignee names; defaults to the
            module-level CANDIDATES list (backward compatible).

    Returns:
        dict mapping each candidate to its list of assigned articles.
    """
    cands = list(CANDIDATES if candidates is None else candidates)
    asgdict = {can: [] for can in cands}
    lrem = len(artlist)
    strt = 0
    # Full round-robin passes: everyone gets one article per pass.
    while lrem > len(cands):
        for i, c in enumerate(cands):
            asgdict[c].append(artlist[strt + i])
        strt += len(cands)
        lrem -= len(cands)
    # Remaining articles go to a randomly ordered subset of candidates.
    # (Debug prints from the original were removed.)
    order = sorted(cands, key=lambda _: random())
    for i in range(lrem):
        asgdict[order[i]].append(artlist[strt + i])
    return asgdict


def main():
    """Fetch the issue page, split the articles among the candidates,
    then download each abstract, translate title/abstract/editor note
    with the Qwen model, and write everything to a per-issue text file.
    """
    conn = HTTPSConnection(PRLURL, port=443)
    #
    htxt = get_page_txt(conn, PRLAPI)
    hpsr = PRLParser()
    hpsr.feed(htxt)
    print(hpsr.article_list)
    aa = assign_articles(hpsr.article_list)
    #
    # Fetch the abstract of every assigned article.
    #
    qt = Qwenturbo(TYQW_APIKEY)
    with open("issue{:d}number{:d}.txt".format(ISSUE, NUMBER), "w", encoding='utf-8') as f:
        for k, v in aa.items():
            print(k, ":", v)
            print("fetch {:s}'s jobs".format(k))
            f.write("{:s}:\n\n".format(k))
            for l in v:
                htxt = get_page_txt(conn, ABSAPI.format(l))
                apsr = ABSParser()
                apsr.feed(htxt)
                # Sleep 0.5 s between requests so we do not get banned
                # for crawling too fast.
                time.sleep(0.5)
                f.write("{:s}\n".format(l))
                # Title, then its translation (qt[...] presumably
                # returns the model's translation — confirm in aitrans).
                f.write("{:s}\n".format(apsr.ttltxt))
                f.write("{:s}\n".format(qt[apsr.ttltxt]))
                if l in hpsr.editor_suggestion:
                    f.write("editor suggestion: {:s}\n".format(hpsr.editor_suggestion[l]))
                    f.write("编辑意见: {:s}\n".format(qt[hpsr.editor_suggestion[l]]))
                # Abstract, then its translation.
                f.write("{:s}\n".format(apsr.abstxt))
                f.write("{:s}\n\n".format(qt[apsr.abstxt]))
            f.write("\n")
    #qt = Qwenturbo(TYQW_APIKEY)
    #qt["如何起床"]



if __name__ == "__main__":
    main()

